import fitz  # PyMuPDF
from config import PDF_PARSER
from utils.logger import logger

def extract_pages(path: str):
    """Open the PDF at ``path`` and return it as a ``fitz.Document``.

    The document is handed back still open; this function never closes it,
    so that is left to the caller.
    """
    document = fitz.open(path)
    return document

def extract_text_words(page):
    """Extract character-level text data from a page.

    Despite the name, the result is per *character*, not per word: the
    page's ``"rawdict"`` text structure is walked block -> line -> span ->
    char, and one entry is produced for every character encountered, in
    that traversal order.

    Args:
        page: Object exposing ``get_text("rawdict")`` that returns a dict
            with a ``"blocks"`` list (a PyMuPDF page is expected).

    Returns:
        A list of ``(x0, y0, x1, y1, char)`` tuples, where the first four
        items are the character's ``"bbox"`` coordinates and ``char`` is
        its ``"c"`` value. Empty if the page holds no text characters.

    Raises:
        KeyError: If the structure lacks ``"blocks"``, ``"spans"``,
            ``"chars"``, ``"bbox"`` or ``"c"`` where they are required.
    """
    page_dict = page.get_text("rawdict")
    return [
        # bbox is expected to hold exactly the four coordinates x0, y0, x1, y1.
        (*char_info["bbox"], char_info["c"])
        for block in page_dict["blocks"]
        # Blocks without a "lines" entry carry no text and are skipped.
        for line in block.get("lines", ())
        for span in line["spans"]
        for char_info in span["chars"]
    ]
